---
layout: post
title: 2021-03-30-Mobile Games AB testing1
tags: [plotly,python,jupyter,jekyll]
---
#plotly is used for interactive web-based visualizations
#conda install -c plotly plotly
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
#load the Cookie Cats A/B-test export (the path is local to the author's machine)
csv_path = '/Users/minyan/Desktop/Python Project/AB testing_interactive display/Datasets/mobilegames_cookie_cats.csv'
df = pd.read_csv(csv_path)
#preview the first rows
df.head()
| | userid | version | sum_gamerounds | retention_1 | retention_7 |
|---|---|---|---|---|---|
| 0 | 116 | gate_30 | 3 | False | False |
| 1 | 337 | gate_30 | 38 | True | False |
| 2 | 377 | gate_40 | 165 | True | False |
| 3 | 483 | gate_40 | 1 | False | False |
| 4 | 488 | gate_40 | 179 | True | True |
#check missing values: info() reports the non-null count and dtype of every column
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 90189 entries, 0 to 90188 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 userid 90189 non-null int64 1 version 90189 non-null object 2 sum_gamerounds 90189 non-null int64 3 retention_1 90189 non-null bool 4 retention_7 90189 non-null bool dtypes: bool(2), int64(2), object(1) memory usage: 2.2+ MB
#tally the missing entries of each column and show the largest tallies first
null_counts = df.isna().sum()
print(null_counts.sort_values(ascending=False))
retention_7 0 retention_1 0 sum_gamerounds 0 version 0 userid 0 dtype: int64
#size of each A/B group: non-null entries per column, split by version
df.groupby(by='version').count()
| version | userid | sum_gamerounds | retention_1 | retention_7 |
|---|---|---|---|---|
| gate_30 | 44700 | 44700 | 44700 | 44700 |
| gate_40 | 45489 | 45489 | 45489 | 45489 |
#average number of game rounds played in each A/B group
df['sum_gamerounds'].groupby(df['version']).mean()
version gate_30 52.456264 gate_40 51.298776 Name: sum_gamerounds, dtype: float64
#summary statistics of the game rounds played, over all players
df.loc[:, 'sum_gamerounds'].describe()
count 90189.000000 mean 51.872457 std 195.050858 min 0.000000 25% 5.000000 50% 16.000000 75% 51.000000 max 49854.000000 Name: sum_gamerounds, dtype: float64
#box plot of the game rounds played, to make the outliers visible
rounds_only = df[['sum_gamerounds']]
fig = px.box(rounds_only, y='sum_gamerounds')
fig.show()
#number of players at each game-round total
rounds_groups = df.groupby('sum_gamerounds')
plot_df = rounds_groups['userid'].count()
plot_df
sum_gamerounds
0 3994
1 5538
2 4606
3 3958
4 3629
...
2294 1
2438 1
2640 1
2961 1
49854 1
Name: userid, Length: 942, dtype: int64
#per-group tally of how many players reached each game-round total
plot_ga = df.loc[df['version']=='gate_30'].groupby('sum_gamerounds')['userid'].count()
plot_gb = df.loc[df['version']=='gate_40'].groupby('sum_gamerounds')['userid'].count()
#NOTE: bins is defined here but is not passed to the traces below
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 400, 600]

#one semi-transparent histogram of the tallies per A/B group
trace1 = go.Histogram(x=plot_ga,
                      name='gate_30',
                      opacity=0.75,
                      marker={'color': 'rgba(171,50,97,0.6)'})
trace2 = go.Histogram(x=plot_gb,
                      name='gate_40',
                      opacity=0.75,
                      marker={'color': 'rgba(12,50,196,0.6)'})
da = [trace1, trace2]

#overlay the two histograms so the groups can be compared directly
lay = go.Layout(title='gate_30 vs. gate_40',
                barmode='overlay',
                xaxis={'title': 'Number of players for each of gamerounds'},
                yaxis={'title': 'Count'})
fig = go.Figure(data=da, layout=lay)
fig.show()
#plot the distribution of players that played 0 to 100 game rounds
#prepare the data: number of players at each game-round total
plot_df = df.groupby('sum_gamerounds')['userid'].count()
#slice by label (inclusive) so exactly the totals 0..100 are kept,
#even if some total in that range has no players
da = plot_df.loc[0:100]
#x is the series index (game rounds), y is the player count
fig = px.line(da)
fig.update_layout(title = 'the number of players that played the 0-100 game rounds during the first week',
                  showlegend = False,
                  xaxis = dict(title ='Number of game rounds'),
                  yaxis = dict(title = 'Counts')
                  )
fig.show()
#Null hypothesis: the difference in conversion rate between the A/B groups is due to chance
#Alternative hypothesis: the conversion rate of group A is statistically significantly larger than that of group B
#overall one-day retention
#1-day retention is a common metric for measuring how fun and engaging a game is
#pooled retention rate: share of all players, regardless of group, that came back after one day
p_pool = df['retention_1'].mean()
#1-day retention of each A/B group
df['retention_1'].groupby(df['version']).mean()
version gate_30 0.448188 gate_40 0.442283 Name: retention_1, dtype: float64
#Solution 1: difference between the 1-day retention rates of the two groups
in_gate_30 = df['version'] == "gate_30"
in_gate_40 = df['version'] == "gate_40"
p_diff = df.loc[in_gate_30, 'retention_1'].mean() - df.loc[in_gate_40, 'retention_1'].mean()
p_diff
0.005905169787341458
#pooled standard error of the difference between two proportions
#group sizes: non-null retention_1 entries per version
count1 = df.loc[df['version'] == "gate_30", 'retention_1'].count()
count2 = df.loc[df['version'] == "gate_40", 'retention_1'].count()
pooled_variance = p_pool*(1-p_pool)
se_pool = np.sqrt(pooled_variance*(1/count1+1/count2))
se_pool
0.0033099127751024513
#for a 95% two-sided confidence interval z is about 1.96; the scipy package gives the exact value
from scipy.stats import norm
alpha = 0.05
#two-sided critical value, kept unrounded so the interval bounds are not distorted
z = norm.ppf(1-alpha/2)
#margin of error of the difference in proportions
marginal_error = z*se_pool
#lower and upper bound of the confidence interval around the observed difference
lb = p_diff-marginal_error
ub = p_diff+marginal_error
#the difference is significant only when the whole interval lies on one side of zero,
#so both bounds are checked (a clearly negative difference must be rejected too)
if lb > 0 or ub < 0:
    print('Reject null hypothesis.')
else:
    print('Do not reject null hypothesis')
Do not reject null hypothesis
#Solution 2: Bootstrapping: should we be confident in the difference?
#resample the players with replacement 1000 times and record, for every resample,
#the mean 1-day retention of each AB-group
boot_1d = [
    df.sample(frac=1, replace=True).groupby('version')['retention_1'].mean()
    for _ in range(1000)
]
#transform the list to a DataFrame: one row per resample, one column per group
boot_1d = pd.DataFrame(boot_1d)
print(boot_1d)
version gate_30 gate_40 retention_1 0.450574 0.441819 retention_1 0.450607 0.440877 retention_1 0.449638 0.442674 retention_1 0.448195 0.439917 retention_1 0.448630 0.441884 ... ... ... retention_1 0.449823 0.439831 retention_1 0.448654 0.443252 retention_1 0.450203 0.445702 retention_1 0.449211 0.441874 retention_1 0.449153 0.439267 [1000 rows x 2 columns]
#kernel density estimate of the bootstrap distribution of each group's retention
#create_distplot draws the density curves; the rug and histogram layers are switched off
import plotly.figure_factory as ff
group_samples = [boot_1d[group] for group in boot_1d.columns]
curve_only = dict(show_rug=False, show_hist=False)
fig1 = ff.create_distplot(group_samples, boot_1d.columns, **curve_only)
fig1.update_layout(title_text = 'A kernel density plot of the boostrap distribution')
fig1.show()
#add a column with the relative difference between the AB groups, in percent of the gate_40 rate
boot_1d['diff'] = (boot_1d.gate_30 - boot_1d.gate_40)/boot_1d.gate_40*100
#express the observed difference in the same unit as boot_1d['diff'];
#p_diff itself is an absolute difference of proportions and cannot share the percent axis
obs_gate_40 = df[df['version']== "gate_40"]['retention_1'].mean()
obs_diff_pct = p_diff/obs_gate_40*100
#plot the bootstrap % difference
da = pd.DataFrame(boot_1d['diff'])
fig2 = ff.create_distplot([da[c] for c in da.columns], da.columns,
                          show_rug=False, show_hist=False)
#vertical marker at the observed % difference
fig2.add_shape(type='line',
               x0=obs_diff_pct, y0=-0.01, x1=obs_diff_pct, y1=0.6,
               line=dict(color='red',width=2)
               )
fig2.add_annotation(
    x=obs_diff_pct,
    y=0.61,
    showarrow=False,
    text=f'{obs_diff_pct:.2f}%')
fig2.update_layout(title_text = '%difference in 1-day retention between AB groups',
                   showlegend = False,
                   xaxis = dict(title ='Percentage of Difference'),
                   yaxis = dict(title = 'Density')
                   )
fig2.show()
#probability that 1-day retention is greater when the gate is at level 30:
#the share of bootstrap resamples whose difference is above zero
prob_gate_30_better = (boot_1d['diff']>0).mean()
print(prob_gate_30_better)
print('Probability that 1-day retention is greater when the gate is at level 30:', prob_gate_30_better)
0.961 Probablity that 1-day retention is greater than observed difference when the gate is at level 30: 0.961
#the probability printed above is the share of bootstrap resamples in which gate_30
#retains better; it is not a p-value. For the one-sided alternative (gate_30 > gate_40)
#the bootstrap p-value is the complement of that share.
prob_gate_30_better = (boot_1d['diff']>0).mean()
p_value = 1-prob_gate_30_better
if p_value < 0.05:
    print(f'Reject Ho because p-value({p_value:.3f}<0.05), 1-day retention is significantly higher with the gate at level 30.')
else:
    print(f'We cant reject Ho because p-value({p_value:.3f}>=0.05), the difference is insignificant.')
We cant reject Ho because p-value(0.961>0.05), the difference is insignificant.
#import plotly.io as pio
#pio.write_html(fig, file='index.html', auto_open=True)